{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a35f4f83",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import warnings\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "warnings.filterwarnings('ignore')\n",
    "plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签\n",
    "plt.rcParams['axes.unicode_minus']=False #用来正常显示负号\n",
    "import missingno"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "08be683e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df=pd.read_csv('farming.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "cbc59509",
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'农产品市场所在省份'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3629\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3628\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 3629\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   3630\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\pandas\\_libs\\index.pyx:136\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\pandas\\_libs\\index.pyx:163\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5198\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "File \u001b[1;32mpandas\\_libs\\hashtable_class_helper.pxi:5206\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mKeyError\u001b[0m: '农产品市场所在省份'",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[34], line 7\u001b[0m\n\u001b[0;32m      1\u001b[0m region_code_mapping \u001b[38;5;241m=\u001b[39m {  \n\u001b[0;32m      2\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m北京\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m天津\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m河北\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m辽宁\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m上海\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m江苏\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m浙江\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m福建\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m山东\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m广东\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m1\u001b[39m,  \u001b[38;5;66;03m# 东部  \u001b[39;00m\n\u001b[0;32m      3\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m山西\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m吉林\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m黑龙江\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m安徽\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m江西\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m河南\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m湖北\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m湖南\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m2\u001b[39m,  \u001b[38;5;66;03m# 中部  \u001b[39;00m\n\u001b[0;32m      4\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m内蒙古\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m广西\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m重庆\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m四川\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m贵州\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m云南\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m西藏\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m陕西\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m甘肃\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m青海\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m宁夏\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m新疆\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m3\u001b[39m  \u001b[38;5;66;03m# 西部  \u001b[39;00m\n\u001b[0;32m      5\u001b[0m }  \n\u001b[1;32m----> 7\u001b[0m df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m区域\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m农产品市场所在省份\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mmap(region_code_mapping)\n\u001b[0;32m      8\u001b[0m one_hot_encoded_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mget_dummies(df[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m农产品市场所在省份\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m农产品类别\u001b[39m\u001b[38;5;124m'\u001b[39m]])  \n\u001b[0;32m      9\u001b[0m   \u001b[38;5;66;03m# 使用 pd.merge 将独热编码后的 DataFrame 与 df1 合并，基于索引  \u001b[39;00m\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\pandas\\core\\frame.py:3505\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m   3503\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[0;32m   3504\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[1;32m-> 3505\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   3506\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[0;32m   3507\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m [indexer]\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\pandas\\core\\indexes\\base.py:3631\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[1;34m(self, key, method, tolerance)\u001b[0m\n\u001b[0;32m   3629\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine\u001b[38;5;241m.\u001b[39mget_loc(casted_key)\n\u001b[0;32m   3630\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m-> 3631\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[0;32m   3632\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[0;32m   3633\u001b[0m     \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[0;32m   3634\u001b[0m     \u001b[38;5;66;03m#  InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[0;32m   3635\u001b[0m     \u001b[38;5;66;03m#  the TypeError.\u001b[39;00m\n\u001b[0;32m   3636\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n",
      "\u001b[1;31mKeyError\u001b[0m: '农产品市场所在省份'"
     ]
    }
   ],
   "source": [
    "region_code_mapping = {  \n",
    "    '北京': 1, '天津': 1, '河北': 1, '辽宁': 1, '上海': 1, '江苏': 1, '浙江': 1, '福建': 1, '山东': 1, '广东': 1,  # 东部  \n",
    "    '山西': 2, '吉林': 2, '黑龙江': 2, '安徽': 2, '江西': 2, '河南': 2, '湖北': 2, '湖南': 2,  # 中部  \n",
    "    '内蒙古': 3, '广西': 3, '重庆': 3, '四川': 3, '贵州': 3, '云南': 3, '西藏': 3, '陕西': 3, '甘肃': 3, '青海': 3, '宁夏': 3, '新疆': 3  # 西部  \n",
    "}  \n",
    "  \n",
    "df['区域'] = df['农产品市场所在省份'].map(region_code_mapping)\n",
    "one_hot_encoded_df = pd.get_dummies(df[['农产品市场所在省份', '农产品类别']])  \n",
    "  # 使用 pd.merge 将独热编码后的 DataFrame 与 df1 合并，基于索引  \n",
    "df= pd.merge(one_hot_encoded_df, df, left_index=True, right_index=True)  \n",
    "df = df.drop(['农产品市场所在省份', '农产品类别'], axis=1)  \n",
    "#市场名称映射值\n",
    "import pandas as pd  \n",
    "from sklearn.preprocessing import LabelEncoder  \n",
    "\n",
    "# 初始化一个 LabelEncoder  \n",
    "label_encoder = LabelEncoder()  \n",
    "   \n",
    "unique_colors = df['市场名称映射值'].unique()  \n",
    "color_to_index = dict(zip(unique_colors, label_encoder.fit_transform(unique_colors)))  \n",
    "index_to_color = dict(zip(label_encoder.transform(unique_colors), unique_colors))  \n",
    "  \n",
    "# 返回原列，并填充编码  \n",
    "df['市场名称映射值'] = df['市场名称映射值'].map(color_to_index) \n",
    "#农产品名称映射值\n",
    "import pandas as pd  \n",
    "from sklearn.preprocessing import LabelEncoder  \n",
    "\n",
    "# 初始化一个 LabelEncoder  \n",
    "label_encoder = LabelEncoder()  \n",
    "  \n",
    "unique_colors = df['农产品名称映射值'].unique()  \n",
    "color_to_index = dict(zip(unique_colors, label_encoder.fit_transform(unique_colors)))  \n",
    "index_to_color = dict(zip(label_encoder.transform(unique_colors), unique_colors))  \n",
    "  \n",
    "# 返回原列，并填充编码  \n",
    "df['农产品名称映射值'] = df['农产品名称映射值'].map(color_to_index) \n",
    "# 转换日期列\n",
    "df['数据入库时间'] = pd.to_datetime(df['数据入库时间'], errors='coerce')\n",
    "df['数据发布时间'] = pd.to_datetime(df['数据发布时间'], errors='coerce')\n",
    "#对日期进行处理\n",
    "df['数据入库年份']= df['数据入库时间'].dt.year\n",
    "df['数据入库月份']= df['数据入库时间'].dt.month\n",
    "df['数据入库日份']= df['数据入库时间'].dt.day\n",
    "\n",
    "df['数据发布年份']= df['数据发布时间'].dt.year\n",
    "df['数据发布月份']= df['数据发布时间'].dt.month\n",
    "df['数据发布日份']= df['数据发布时间'].dt.day\n",
    "\n",
    "df = df.drop(['数据入库时间','数据发布时间'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "9ab62e6f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Column '农产品市场所在省份' does not exist in the DataFrame.\n",
      "Empty DataFrame\n",
      "Columns: [Unnamed: 0, 农产品市场所在省份_上海, 农产品市场所在省份_云南, 农产品市场所在省份_内蒙古, 农产品市场所在省份_北京, 农产品市场所在省份_四川, 农产品市场所在省份_天津, 农产品市场所在省份_安徽, 农产品市场所在省份_山东, 农产品市场所在省份_新疆, 农产品市场所在省份_江苏, 农产品市场所在省份_江西, 农产品市场所在省份_河北, 农产品市场所在省份_河南, 农产品市场所在省份_浙江, 农产品市场所在省份_湖北, 农产品市场所在省份_湖南, 农产品市场所在省份_甘肃, 农产品市场所在省份_福建, 农产品市场所在省份_辽宁, 农产品市场所在省份_陕西, 农产品类别_客菜, 农产品类别_干花, 农产品类别_康乃馨, 农产品类别_本地菜, 农产品类别_果品, 农产品类别_水产, 农产品类别_永生花, 农产品类别_玫瑰花, 农产品类别_畜禽蛋品, 农产品类别_粮油饲料, 农产品类别_菊花, 农产品类别_蔬菜, 农产品类别_配叶类, 农产品类别_配花类, 农产品类别_鲜肉, 市场名称映射值, 农产品名称映射值, 规格, 区域, 颜色, 单位, 最低交易价格, 平均交易价格, 最高交易价格, 数据入库年份, 数据入库月份, 数据入库日份, 数据发布年份, 数据发布月份, 数据发布日份]\n",
      "Index: []\n",
      "\n",
      "[0 rows x 51 columns]\n"
     ]
    }
   ],
   "source": [
    "# 处理缺失值\n",
    "df.replace(r'\\N', pd.NA, inplace=True)\n",
    "df.dropna(inplace=True)\n",
    "# 区域编码\n",
    "region_code_mapping = {  \n",
    "    '北京': 1, '天津': 1, '河北': 1, '辽宁': 1, '上海': 1, '江苏': 1, '浙江': 1, '福建': 1, '山东': 1, '广东': 1,\n",
    "    '山西': 2, '吉林': 2, '黑龙江': 2, '安徽': 2, '江西': 2, '河南': 2, '湖北': 2, '湖南': 2,\n",
    "    '内蒙古': 3, '广西': 3, '重庆': 3, '四川': 3, '贵州': 3, '云南': 3, '西藏': 3, '陕西': 3, '甘肃': 3, '青海': 3, '宁夏': 3, '新疆': 3\n",
    "}\n",
    "if '农产品市场所在省份' in df.columns:\n",
    "    df['区域'] = df['农产品市场所在省份'].map(region_code_mapping)\n",
    "else:\n",
    "    print(\"Column '农产品市场所在省份' does not exist in the DataFrame.\")\n",
    "\n",
    "# 独热编码\n",
    "if '农产品市场所在省份' in df.columns and '农产品类别' in df.columns:\n",
    "    one_hot_encoded_df = pd.get_dummies(df[['农产品市场所在省份', '农产品类别']], drop_first=True)\n",
    "    df = pd.concat([df, one_hot_encoded_df], axis=1)\n",
    "    df = df.drop(['农产品市场所在省份', '农产品类别'], axis=1)\n",
    "\n",
    "# 标签编码\n",
    "label_encoder = LabelEncoder()\n",
    "if '市场名称映射值' in df.columns:\n",
    "    unique_colors = df['市场名称映射值'].unique()\n",
    "    color_to_index = dict(zip(unique_colors, label_encoder.fit_transform(unique_colors)))\n",
    "    df['市场名称映射值'] = df['市场名称映射值'].map(color_to_index)\n",
    "\n",
    "if '农产品名称映射值' in df.columns:\n",
    "    unique_products = df['农产品名称映射值'].unique()\n",
    "    product_to_index = dict(zip(unique_products, label_encoder.fit_transform(unique_products)))\n",
    "    df['农产品名称映射值'] = df['农产品名称映射值'].map(product_to_index)\n",
    "\n",
    "# 日期处理\n",
    "if '数据入库时间' in df.columns and '数据发布时间' in df.columns:\n",
    "    df['数据入库时间'] = pd.to_datetime(df['数据入库时间'], errors='coerce')\n",
    "    df['数据发布时间'] = pd.to_datetime(df['数据发布时间'], errors='coerce')\n",
    "    df['数据入库年份'] = df['数据入库时间'].dt.year\n",
    "    df['数据入库月份'] = df['数据入库时间'].dt.month\n",
    "    df['数据入库日'] = df['数据入库时间'].dt.day\n",
    "    df['数据发布年份'] = df['数据发布时间'].dt.year\n",
    "    df['数据发布月份'] = df['数据发布时间'].dt.month\n",
    "    df['数据发布日'] = df['数据发布时间'].dt.day\n",
    "    df = df.drop(['数据入库时间', '数据发布时间'], axis=1)\n",
    "\n",
    "# 输出清洗后的数据\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "4ad4fd48",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DataFrame is empty. Check data loading and preprocessing steps.\n"
     ]
    }
   ],
   "source": [
    "if df.shape[0] > 0:\n",
    "    X = df.drop(['最低交易价格', '平均交易价格', '最高交易价格'], axis=1)\n",
    "    Y = df[['最低交易价格', '平均交易价格', '最高交易价格']]\n",
    "    \n",
    "    # 检查X和Y的形状\n",
    "    print(\"X shape:\", X.shape)\n",
    "    print(\"Y shape:\", Y.shape)\n",
    "    \n",
    "    if X.shape[0] > 0 and Y.shape[0] > 0:\n",
    "        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=536)\n",
    "        print(\"Train-test split successful.\")\n",
    "    else:\n",
    "        print(\"X or Y is empty after dropping columns. Check column names and data processing steps.\")\n",
    "else:\n",
    "    print(\"DataFrame is empty. Check data loading and preprocessing steps.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1223b2b9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "8c820840",
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[62], line 4\u001b[0m\n\u001b[0;32m      2\u001b[0m X \u001b[38;5;241m=\u001b[39m df\u001b[38;5;241m.\u001b[39mdrop([\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m最低交易价格\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m平均交易价格\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m最高交易价格\u001b[39m\u001b[38;5;124m'\u001b[39m], axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m      3\u001b[0m Y \u001b[38;5;241m=\u001b[39m df[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m最低交易价格\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m平均交易价格\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m最高交易价格\u001b[39m\u001b[38;5;124m'\u001b[39m]]\n\u001b[1;32m----> 4\u001b[0m X_train, X_test, Y_train, Y_test \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m      5\u001b[0m \u001b[43m                                                    \u001b[49m\u001b[43mY\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[0;32m      6\u001b[0m \u001b[43m                                                    \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.3\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m      7\u001b[0m \u001b[43m                                                    \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m536\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m      9\u001b[0m \u001b[38;5;66;03m# 建立随机森林回归模型\u001b[39;00m\n\u001b[0;32m     10\u001b[0m rf \u001b[38;5;241m=\u001b[39m RandomForestRegressor(n_estimators\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m321\u001b[39m, max_depth\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m25\u001b[39m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m536\u001b[39m)\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\model_selection\\_split.py:2420\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m   2417\u001b[0m arrays \u001b[38;5;241m=\u001b[39m indexable(\u001b[38;5;241m*\u001b[39marrays)\n\u001b[0;32m   2419\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(arrays[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m-> 2420\u001b[0m n_train, n_test \u001b[38;5;241m=\u001b[39m \u001b[43m_validate_shuffle_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   2421\u001b[0m \u001b[43m    \u001b[49m\u001b[43mn_samples\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault_test_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.25\u001b[39;49m\n\u001b[0;32m   2422\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2424\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m shuffle \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m:\n\u001b[0;32m   2425\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m stratify \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\model_selection\\_split.py:2098\u001b[0m, in \u001b[0;36m_validate_shuffle_split\u001b[1;34m(n_samples, test_size, train_size, default_test_size)\u001b[0m\n\u001b[0;32m   2095\u001b[0m n_train, n_test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(n_train), \u001b[38;5;28mint\u001b[39m(n_test)\n\u001b[0;32m   2097\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m-> 2098\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m   2099\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWith n_samples=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m, test_size=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m and train_size=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m, the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2100\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresulting train set will be empty. Adjust any of the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2101\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maforementioned parameters.\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(n_samples, test_size, train_size)\n\u001b[0;32m   2102\u001b[0m     )\n\u001b[0;32m   2104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m n_train, n_test\n",
      "\u001b[1;31mValueError\u001b[0m: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters."
     ]
    }
   ],
   "source": [
    "# 准备数据集\n",
    "X = df.drop(['最低交易价格', '平均交易价格', '最高交易价格'], axis=1)\n",
    "Y = df[['最低交易价格', '平均交易价格', '最高交易价格']]\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X,\n",
    "                                                    Y, \n",
    "                                                    test_size=0.3,\n",
    "                                                    random_state=536)\n",
    "\n",
    "# 建立随机森林回归模型\n",
    "rf = RandomForestRegressor(n_estimators=321, max_depth=25, random_state=536)\n",
    "\n",
    "# 模型训练\n",
    "rf.fit(X_train, Y_train)\n",
    "\n",
    "# 模型预测\n",
    "Y_pred = rf.predict(X_test)\n",
    "\n",
    "# 模型评估\n",
    "mse = mean_squared_error(Y_test, Y_pred)\n",
    "r2 = r2_score(Y_test, Y_pred)\n",
    "score_f= rf.score(X_test,Y_test)\n",
    "\n",
    "#模型评估\n",
    "print('MSE',mean_squared_error(Y_test, Y_pred))\n",
    "print('MAE',mean_absolute_error(Y_test,Y_pred))\n",
    "print('Random Forest:{}'.format(score_f))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08070317",
   "metadata": {},
   "outputs": [],
   "source": [
    "#拆分数据集\n",
    "X_train,X_test,Y_train, Y_test = train_test_split(X,\n",
    "                                                  Y,\n",
    "                                                  test_size=0.3,\n",
    "                                                  random_state=536)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b35d1b87",
   "metadata": {},
   "outputs": [],
   "source": [
    "rf = RandomForestRegressor(n_estimators=321,max_depth=25)#建立模型\n",
    "rf.fit(X_train, Y_train)\n",
    "pred = rf.predict(X_test)\n",
    "score_f= rf.score(X_test,Y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8746df01",
   "metadata": {},
   "outputs": [],
   "source": [
    "#模型评估\n",
    "print('MSE',mean_squared_error(Y_test, pred))\n",
    "print('MAE',mean_absolute_error(Y_test, pred))\n",
    "print('Random Forest:{}'.format(score_f))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5801c869",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "1921af7b",
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[60], line 4\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[0;32m      3\u001b[0m \u001b[38;5;66;03m# 假设X是特征集，Y是目标变量\u001b[39;00m\n\u001b[1;32m----> 4\u001b[0m X_train, X_test, Y_train, Y_test \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.3\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m536\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\model_selection\\_split.py:2420\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m   2417\u001b[0m arrays \u001b[38;5;241m=\u001b[39m indexable(\u001b[38;5;241m*\u001b[39marrays)\n\u001b[0;32m   2419\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(arrays[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m-> 2420\u001b[0m n_train, n_test \u001b[38;5;241m=\u001b[39m \u001b[43m_validate_shuffle_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   2421\u001b[0m \u001b[43m    \u001b[49m\u001b[43mn_samples\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault_test_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.25\u001b[39;49m\n\u001b[0;32m   2422\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2424\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m shuffle \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m:\n\u001b[0;32m   2425\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m stratify \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\model_selection\\_split.py:2098\u001b[0m, in \u001b[0;36m_validate_shuffle_split\u001b[1;34m(n_samples, test_size, train_size, default_test_size)\u001b[0m\n\u001b[0;32m   2095\u001b[0m n_train, n_test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(n_train), \u001b[38;5;28mint\u001b[39m(n_test)\n\u001b[0;32m   2097\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m-> 2098\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m   2099\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWith n_samples=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m, test_size=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m and train_size=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m, the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2100\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresulting train set will be empty. Adjust any of the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2101\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maforementioned parameters.\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(n_samples, test_size, train_size)\n\u001b[0;32m   2102\u001b[0m     )\n\u001b[0;32m   2104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m n_train, n_test\n",
      "\u001b[1;31mValueError\u001b[0m: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters."
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# 假设X是特征集，Y是目标变量\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=536)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "b459b867",
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[63], line 4\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[0;32m      3\u001b[0m \u001b[38;5;66;03m# 假设X是特征集，Y是目标变量\u001b[39;00m\n\u001b[1;32m----> 4\u001b[0m X_train, X_test, Y_train, Y_test \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_test_split\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mY\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.3\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_state\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m536\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\model_selection\\_split.py:2420\u001b[0m, in \u001b[0;36mtrain_test_split\u001b[1;34m(test_size, train_size, random_state, shuffle, stratify, *arrays)\u001b[0m\n\u001b[0;32m   2417\u001b[0m arrays \u001b[38;5;241m=\u001b[39m indexable(\u001b[38;5;241m*\u001b[39marrays)\n\u001b[0;32m   2419\u001b[0m n_samples \u001b[38;5;241m=\u001b[39m _num_samples(arrays[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m-> 2420\u001b[0m n_train, n_test \u001b[38;5;241m=\u001b[39m \u001b[43m_validate_shuffle_split\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   2421\u001b[0m \u001b[43m    \u001b[49m\u001b[43mn_samples\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault_test_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0.25\u001b[39;49m\n\u001b[0;32m   2422\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   2424\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m shuffle \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m:\n\u001b[0;32m   2425\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m stratify \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\model_selection\\_split.py:2098\u001b[0m, in \u001b[0;36m_validate_shuffle_split\u001b[1;34m(n_samples, test_size, train_size, default_test_size)\u001b[0m\n\u001b[0;32m   2095\u001b[0m n_train, n_test \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mint\u001b[39m(n_train), \u001b[38;5;28mint\u001b[39m(n_test)\n\u001b[0;32m   2097\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m n_train \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m-> 2098\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m   2099\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWith n_samples=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m, test_size=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m and train_size=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m, the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2100\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresulting train set will be empty. Adjust any of the \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   2101\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maforementioned parameters.\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(n_samples, test_size, train_size)\n\u001b[0;32m   2102\u001b[0m     )\n\u001b[0;32m   2104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m n_train, n_test\n",
      "\u001b[1;31mValueError\u001b[0m: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters."
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# 假设X是特征集，Y是目标变量\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=536)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "0ad2c99e",
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid syntax (560453000.py, line 19)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;36m  Cell \u001b[1;32mIn[75], line 19\u001b[1;36m\u001b[0m\n\u001b[1;33m    pipeline =Pipeline(steps=[('preprocessor', preprocessor)])\u001b[0m\n\u001b[1;37m    ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from sklearn.pipeline import Pipeline\n",
    "#假设'df 是你的 DataFrame#df=pd.read csv('your data file.csv')#如果你是从文件加载数据\n",
    "#定义分类特征和数值特征的列名\n",
    "categorical_features =[ '农产品市场所在省份','农产品类别','颜色','单位']\n",
    "numerical_features=['平均交易价格','最高交易价格']\n",
    "#创建一个 ColumpTransformer 来处理分类特征\n",
    "preprocessor =ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('cat',OneHotEncoder(sparse=False),categorical_features),\n",
    "        ('num','passthrough',numerical_features)\n",
    "#应用预处理器处理数据\n",
    "pipeline =Pipeline(steps=[('preprocessor', preprocessor)])\n",
    "processed_data=pipeline.fit_transform(df)\n",
    "#分制特征和目标变量，假设目标变量是，平均交易价格’\n",
    "Y=processed_data[:，2] #因为目标变量，平均交易价’是倒数第二列\n",
    "X=processed datal:,:2]\n",
    "#拆分数据集\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=536)\n",
    "rf = RandomForestRegressor(n_estimators=321,max_depth=25)#建立模型\n",
    "rf.fit(X_train, Y_train)\n",
    "pred = rf.predict(X_test)\n",
    "score_f= rf.score(X_test,Y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "721ea4df",
   "metadata": {},
   "outputs": [],
   "source": [
    "data3=pd.read_csv('farming.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "68917a67",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.398013632369532\n",
      "0.42808634018832514\n",
      "0.45283693546106957\n",
      "0.38689540662880206\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[71], line 39\u001b[0m\n\u001b[0;32m     36\u001b[0m \u001b[38;5;28mprint\u001b[39m((groud_truth1 \u001b[38;5;241m==\u001b[39m predict1)\u001b[38;5;241m.\u001b[39mmean())\n\u001b[0;32m     38\u001b[0m \u001b[38;5;66;03m# 重复上述步骤，分别对 y2 和 y3 进行预测和评估\u001b[39;00m\n\u001b[1;32m---> 39\u001b[0m \u001b[43malg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my2\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     40\u001b[0m predict2 \u001b[38;5;241m=\u001b[39m alg\u001b[38;5;241m.\u001b[39mpredict(data3[sell][\u001b[38;5;241m950001\u001b[39m:])\n\u001b[0;32m     41\u001b[0m results\u001b[38;5;241m.\u001b[39mappend((leaf_size, n_estimators_size, (groud_truth2 \u001b[38;5;241m==\u001b[39m predict2)\u001b[38;5;241m.\u001b[39mmean()))\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:450\u001b[0m, in \u001b[0;36mBaseForest.fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m    439\u001b[0m trees \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m    440\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_estimator(append\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, random_state\u001b[38;5;241m=\u001b[39mrandom_state)\n\u001b[0;32m    441\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(n_more_estimators)\n\u001b[0;32m    442\u001b[0m ]\n\u001b[0;32m    444\u001b[0m \u001b[38;5;66;03m# Parallel loop: we prefer the threading backend as the Cython code\u001b[39;00m\n\u001b[0;32m    445\u001b[0m \u001b[38;5;66;03m# for fitting the trees is internally releasing the Python GIL\u001b[39;00m\n\u001b[0;32m    446\u001b[0m \u001b[38;5;66;03m# making threading more efficient than multiprocessing in\u001b[39;00m\n\u001b[0;32m    447\u001b[0m \u001b[38;5;66;03m# that case. However, for joblib 0.12+ we respect any\u001b[39;00m\n\u001b[0;32m    448\u001b[0m \u001b[38;5;66;03m# parallel_backend contexts set at a higher level,\u001b[39;00m\n\u001b[0;32m    449\u001b[0m \u001b[38;5;66;03m# since correctness does not rely on using threads.\u001b[39;00m\n\u001b[1;32m--> 450\u001b[0m trees \u001b[38;5;241m=\u001b[39m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    451\u001b[0m \u001b[43m    \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    452\u001b[0m \u001b[43m    \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    453\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m_joblib_parallel_args\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprefer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mthreads\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    454\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    455\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_parallel_build_trees\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    456\u001b[0m \u001b[43m        \u001b[49m\u001b[43mt\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    457\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m    458\u001b[0m \u001b[43m        \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    459\u001b[0m \u001b[43m        \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    460\u001b[0m \u001b[43m        \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    461\u001b[0m \u001b[43m        \u001b[49m\u001b[43mi\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    462\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtrees\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    463\u001b[0m \u001b[43m        \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mverbose\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    464\u001b[0m \u001b[43m        \u001b[49m\u001b[43mclass_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclass_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    465\u001b[0m \u001b[43m        \u001b[49m\u001b[43mn_samples_bootstrap\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_samples_bootstrap\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    466\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    467\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtrees\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    468\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    470\u001b[0m \u001b[38;5;66;03m# Collect newly grown trees\u001b[39;00m\n\u001b[0;32m    471\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mestimators_\u001b[38;5;241m.\u001b[39mextend(trees)\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\joblib\\parallel.py:1918\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m   1916\u001b[0m     output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m   1917\u001b[0m     \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 1918\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1920\u001b[0m \u001b[38;5;66;03m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m   1921\u001b[0m \u001b[38;5;66;03m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m   1922\u001b[0m \u001b[38;5;66;03m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m   1923\u001b[0m \u001b[38;5;66;03m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m   1924\u001b[0m \u001b[38;5;66;03m# callback.\u001b[39;00m\n\u001b[0;32m   1925\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\joblib\\parallel.py:1847\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m   1845\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_batches \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m   1846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1847\u001b[0m res \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1848\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_completed_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m   1849\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_progress()\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\utils\\fixes.py:216\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m    214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m    215\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m config_context(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig):\n\u001b[1;32m--> 216\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunction(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\ensemble\\_forest.py:185\u001b[0m, in \u001b[0;36m_parallel_build_trees\u001b[1;34m(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight, n_samples_bootstrap)\u001b[0m\n\u001b[0;32m    182\u001b[0m     \u001b[38;5;28;01melif\u001b[39;00m class_weight \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbalanced_subsample\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m    183\u001b[0m         curr_sample_weight \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m=\u001b[39m compute_sample_weight(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbalanced\u001b[39m\u001b[38;5;124m\"\u001b[39m, y, indices\u001b[38;5;241m=\u001b[39mindices)\n\u001b[1;32m--> 185\u001b[0m     \u001b[43mtree\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurr_sample_weight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcheck_input\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m    186\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    187\u001b[0m     tree\u001b[38;5;241m.\u001b[39mfit(X, y, sample_weight\u001b[38;5;241m=\u001b[39msample_weight, check_input\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\tree\\_classes.py:1315\u001b[0m, in \u001b[0;36mDecisionTreeRegressor.fit\u001b[1;34m(self, X, y, sample_weight, check_input, X_idx_sorted)\u001b[0m\n\u001b[0;32m   1278\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfit\u001b[39m(\n\u001b[0;32m   1279\u001b[0m     \u001b[38;5;28mself\u001b[39m, X, y, sample_weight\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, check_input\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, X_idx_sorted\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeprecated\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   1280\u001b[0m ):\n\u001b[0;32m   1281\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Build a decision tree regressor from the training set (X, y).\u001b[39;00m\n\u001b[0;32m   1282\u001b[0m \n\u001b[0;32m   1283\u001b[0m \u001b[38;5;124;03m    Parameters\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   1312\u001b[0m \u001b[38;5;124;03m        Fitted estimator.\u001b[39;00m\n\u001b[0;32m   1313\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m-> 1315\u001b[0m     \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1316\u001b[0m \u001b[43m        \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1317\u001b[0m \u001b[43m        \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1318\u001b[0m \u001b[43m        \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1319\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcheck_input\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_input\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1320\u001b[0m \u001b[43m        \u001b[49m\u001b[43mX_idx_sorted\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX_idx_sorted\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   1321\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1322\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\tree\\_classes.py:420\u001b[0m, in \u001b[0;36mBaseDecisionTree.fit\u001b[1;34m(self, X, y, sample_weight, check_input, X_idx_sorted)\u001b[0m\n\u001b[0;32m    409\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    410\u001b[0m     builder \u001b[38;5;241m=\u001b[39m BestFirstTreeBuilder(\n\u001b[0;32m    411\u001b[0m         splitter,\n\u001b[0;32m    412\u001b[0m         min_samples_split,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    417\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmin_impurity_decrease,\n\u001b[0;32m    418\u001b[0m     )\n\u001b[1;32m--> 420\u001b[0m \u001b[43mbuilder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbuild\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtree_\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    422\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_outputs_ \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m is_classifier(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m    423\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_classes_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_classes_[\u001b[38;5;241m0\u001b[39m]\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "# 假设 data3 是您的 DataFrame，sell 是特征列\n",
    "sell = [\"市场名称映射值\", \"农产品名称映射值\", \"区域\", \"颜色\", \"单位\", \"最低交易价格\", \"平均交易价格\", \"最高交易价格\"]\n",
    "\n",
    "# 准备数据\n",
    "X = data3[sell][:950000]  # 特征\n",
    "y1 = data3['最低交易价格'][:950000]  # 目标变量1\n",
    "y2 = data3['平均交易价格'][:950000]  # 目标变量2\n",
    "y3 = data3['最高交易价格'][:950000]  # 目标变量3\n",
    "\n",
    "# 存放不同参数取值，以及对应的精度\n",
    "results = []\n",
    "\n",
    "# 最小叶子结点的参数取值\n",
    "sample_leaf_options = list(range(1, 500, 3))\n",
    "# 决策树个数参数取值\n",
    "n_estimators_options = list(range(1, 1000, 5))\n",
    "\n",
    "# 真实值\n",
    "groud_truth1 = data3['最低交易价格'][950001:]\n",
    "groud_truth2 = data3['平均交易价格'][950001:]\n",
    "groud_truth3 = data3['最高交易价格'][950001:]\n",
    "\n",
    "for leaf_size in sample_leaf_options:\n",
    "    for n_estimators_size in n_estimators_options:\n",
    "        # 使用随机森林回归模型\n",
    "        alg = RandomForestRegressor(min_samples_leaf=leaf_size, n_estimators=n_estimators_size, random_state=50)\n",
    "        alg.fit(X, y1)  # 训练模型\n",
    "        predict1 = alg.predict(data3[sell][950001:])  # 预测\n",
    "        results.append((leaf_size, n_estimators_size, (groud_truth1 == predict1).mean()))\n",
    "        print((groud_truth1 == predict1).mean())\n",
    "\n",
    "        # 重复上述步骤，分别对 y2 和 y3 进行预测和评估\n",
    "        alg.fit(X, y2)\n",
    "        predict2 = alg.predict(data3[sell][950001:])\n",
    "        results.append((leaf_size, n_estimators_size, (groud_truth2 == predict2).mean()))\n",
    "        print((groud_truth2 == predict2).mean())\n",
    "\n",
    "        alg.fit(X, y3)\n",
    "        predict3 = alg.predict(data3[sell][950001:])\n",
    "        results.append((leaf_size, n_estimators_size, (groud_truth3 == predict3).mean()))\n",
    "        print((groud_truth3 == predict3).mean())\n",
    "\n",
    "# 打印精度最大的那一个三元组\n",
    "print(max(results, key=lambda x: x[2]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62d906b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "results.append((leaf_size, n_estimators_size, (groud_truth == predict).mean()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f7ebab4f",
   "metadata": {},
   "outputs": [
    {
     "ename": "MemoryError",
     "evalue": "Unable to allocate 12.9 GiB for an array with shape (1310760, 1326) and data type float64",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mMemoryError\u001b[0m                               Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[3], line 25\u001b[0m\n\u001b[0;32m     23\u001b[0m \u001b[38;5;66;03m# 应用预处理器处理数据\u001b[39;00m\n\u001b[0;32m     24\u001b[0m pipeline \u001b[38;5;241m=\u001b[39m Pipeline(steps\u001b[38;5;241m=\u001b[39m[(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpreprocessor\u001b[39m\u001b[38;5;124m'\u001b[39m, preprocessor)])\n\u001b[1;32m---> 25\u001b[0m processed_data \u001b[38;5;241m=\u001b[39m \u001b[43mpipeline\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     27\u001b[0m \u001b[38;5;66;03m# 分割特征和目标变量\u001b[39;00m\n\u001b[0;32m     28\u001b[0m \u001b[38;5;66;03m# 假设目标变量是 '平均交易价格'\u001b[39;00m\n\u001b[0;32m     29\u001b[0m Y \u001b[38;5;241m=\u001b[39m processed_data[:, \u001b[38;5;241m2\u001b[39m]  \u001b[38;5;66;03m# 因为目标变量 '平均交易价格' 是倒数第二列\u001b[39;00m\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\pipeline.py:434\u001b[0m, in \u001b[0;36mPipeline.fit_transform\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m    432\u001b[0m fit_params_last_step \u001b[38;5;241m=\u001b[39m fit_params_steps[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msteps[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m][\u001b[38;5;241m0\u001b[39m]]\n\u001b[0;32m    433\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(last_step, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m--> 434\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m last_step\u001b[38;5;241m.\u001b[39mfit_transform(Xt, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params_last_step)\n\u001b[0;32m    435\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    436\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m last_step\u001b[38;5;241m.\u001b[39mfit(Xt, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params_last_step)\u001b[38;5;241m.\u001b[39mtransform(Xt)\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\compose\\_column_transformer.py:675\u001b[0m, in \u001b[0;36mColumnTransformer.fit_transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m    672\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_column_callables(X)\n\u001b[0;32m    673\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_remainder(X)\n\u001b[1;32m--> 675\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_fit_transform_one\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    677\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m result:\n\u001b[0;32m    678\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_fitted_transformers([])\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\compose\\_column_transformer.py:606\u001b[0m, in \u001b[0;36mColumnTransformer._fit_transform\u001b[1;34m(self, X, y, func, fitted, column_as_strings)\u001b[0m\n\u001b[0;32m    600\u001b[0m transformers \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\n\u001b[0;32m    601\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_iter(\n\u001b[0;32m    602\u001b[0m         fitted\u001b[38;5;241m=\u001b[39mfitted, replace_strings\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, column_as_strings\u001b[38;5;241m=\u001b[39mcolumn_as_strings\n\u001b[0;32m    603\u001b[0m     )\n\u001b[0;32m    604\u001b[0m )\n\u001b[0;32m    605\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 606\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mParallel\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    607\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdelayed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    608\u001b[0m \u001b[43m            \u001b[49m\u001b[43mtransformer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclone\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrans\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfitted\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mtrans\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    609\u001b[0m \u001b[43m            \u001b[49m\u001b[43mX\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_safe_indexing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    610\u001b[0m \u001b[43m            \u001b[49m\u001b[43my\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    611\u001b[0m \u001b[43m            \u001b[49m\u001b[43mweight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    612\u001b[0m \u001b[43m            \u001b[49m\u001b[43mmessage_clsname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mColumnTransformer\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m    613\u001b[0m \u001b[43m            \u001b[49m\u001b[43mmessage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_message\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43midx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtransformers\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    614\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    615\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43midx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrans\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43menumerate\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtransformers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m    616\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    617\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m    618\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected 2D array, got 1D array instead\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(e):\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\joblib\\parallel.py:1918\u001b[0m, in \u001b[0;36mParallel.__call__\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m   1916\u001b[0m     output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_sequential_output(iterable)\n\u001b[0;32m   1917\u001b[0m     \u001b[38;5;28mnext\u001b[39m(output)\n\u001b[1;32m-> 1918\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m output \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturn_generator \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1920\u001b[0m \u001b[38;5;66;03m# Let's create an ID that uniquely identifies the current call. If the\u001b[39;00m\n\u001b[0;32m   1921\u001b[0m \u001b[38;5;66;03m# call is interrupted early and that the same instance is immediately\u001b[39;00m\n\u001b[0;32m   1922\u001b[0m \u001b[38;5;66;03m# re-used, this id will be used to prevent workers that were\u001b[39;00m\n\u001b[0;32m   1923\u001b[0m \u001b[38;5;66;03m# concurrently finalizing a task from the previous call to run the\u001b[39;00m\n\u001b[0;32m   1924\u001b[0m \u001b[38;5;66;03m# callback.\u001b[39;00m\n\u001b[0;32m   1925\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lock:\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\joblib\\parallel.py:1847\u001b[0m, in \u001b[0;36mParallel._get_sequential_output\u001b[1;34m(self, iterable)\u001b[0m\n\u001b[0;32m   1845\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_batches \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m   1846\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_dispatched_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m-> 1847\u001b[0m res \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m   1848\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_completed_tasks \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m   1849\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_progress()\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\utils\\fixes.py:216\u001b[0m, in \u001b[0;36m_FuncWrapper.__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m    214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m    215\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m config_context(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig):\n\u001b[1;32m--> 216\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfunction(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\pipeline.py:893\u001b[0m, in \u001b[0;36m_fit_transform_one\u001b[1;34m(transformer, X, y, weight, message_clsname, message, **fit_params)\u001b[0m\n\u001b[0;32m    891\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _print_elapsed_time(message_clsname, message):\n\u001b[0;32m    892\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(transformer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfit_transform\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m--> 893\u001b[0m         res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit_transform(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\n\u001b[0;32m    894\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    895\u001b[0m         res \u001b[38;5;241m=\u001b[39m transformer\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\u001b[38;5;241m.\u001b[39mtransform(X)\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:488\u001b[0m, in \u001b[0;36mOneHotEncoder.fit_transform\u001b[1;34m(self, X, y)\u001b[0m\n\u001b[0;32m    466\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m    467\u001b[0m \u001b[38;5;124;03mFit OneHotEncoder to X, then transform X.\u001b[39;00m\n\u001b[0;32m    468\u001b[0m \n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    485\u001b[0m \u001b[38;5;124;03m    returned.\u001b[39;00m\n\u001b[0;32m    486\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m    487\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_keywords()\n\u001b[1;32m--> 488\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_transform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\base.py:852\u001b[0m, in \u001b[0;36mTransformerMixin.fit_transform\u001b[1;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[0;32m    848\u001b[0m \u001b[38;5;66;03m# non-optimized default implementation; override when a better\u001b[39;00m\n\u001b[0;32m    849\u001b[0m \u001b[38;5;66;03m# method is possible for a given clustering algorithm\u001b[39;00m\n\u001b[0;32m    850\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m    851\u001b[0m     \u001b[38;5;66;03m# fit method of arity 1 (unsupervised transformation)\u001b[39;00m\n\u001b[1;32m--> 852\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfit_params\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransform\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    853\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    854\u001b[0m     \u001b[38;5;66;03m# fit method of arity 2 (supervised transformation)\u001b[39;00m\n\u001b[0;32m    855\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit(X, y, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfit_params)\u001b[38;5;241m.\u001b[39mtransform(X)\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:557\u001b[0m, in \u001b[0;36mOneHotEncoder.transform\u001b[1;34m(self, X)\u001b[0m\n\u001b[0;32m    551\u001b[0m out \u001b[38;5;241m=\u001b[39m sparse\u001b[38;5;241m.\u001b[39mcsr_matrix(\n\u001b[0;32m    552\u001b[0m     (data, indices, indptr),\n\u001b[0;32m    553\u001b[0m     shape\u001b[38;5;241m=\u001b[39m(n_samples, feature_indices[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]),\n\u001b[0;32m    554\u001b[0m     dtype\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdtype,\n\u001b[0;32m    555\u001b[0m )\n\u001b[0;32m    556\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msparse:\n\u001b[1;32m--> 557\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mout\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoarray\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    558\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    559\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m out\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\scipy\\sparse\\_compressed.py:1051\u001b[0m, in \u001b[0;36m_cs_matrix.toarray\u001b[1;34m(self, order, out)\u001b[0m\n\u001b[0;32m   1049\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m out \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m order \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m   1050\u001b[0m     order \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_swap(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcf\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m-> 1051\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_toarray_args\u001b[49m\u001b[43m(\u001b[49m\u001b[43morder\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1052\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (out\u001b[38;5;241m.\u001b[39mflags\u001b[38;5;241m.\u001b[39mc_contiguous \u001b[38;5;129;01mor\u001b[39;00m out\u001b[38;5;241m.\u001b[39mflags\u001b[38;5;241m.\u001b[39mf_contiguous):\n\u001b[0;32m   1053\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mOutput array must be C or F contiguous\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
      "File \u001b[1;32mD:\\python\\Anaconda\\lib\\site-packages\\scipy\\sparse\\_base.py:1298\u001b[0m, in \u001b[0;36mspmatrix._process_toarray_args\u001b[1;34m(self, order, out)\u001b[0m\n\u001b[0;32m   1296\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m out\n\u001b[0;32m   1297\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 1298\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mzeros\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshape\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43morder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morder\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[1;31mMemoryError\u001b[0m: Unable to allocate 12.9 GiB for an array with shape (1310760, 1326) and data type float64"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "\n",
    "# 假设 'df' 是你的 DataFrame\n",
    "# df = pd.read_csv('your_data_file.csv')  # 如果你是从文件加载数据\n",
    "\n",
    "# 定义分类特征和数值特征的列名\n",
    "categorical_features = ['农产品名称映射值', '区域', '颜色', '单位']\n",
    "numerical_features = ['平均交易价格', '最高交易价格']\n",
    "\n",
    "# 创建一个 ColumnTransformer 来处理分类特征\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('cat', OneHotEncoder(sparse=False), categorical_features),  # 设置 sparse_output=False\n",
    "        ('num', 'passthrough', numerical_features)\n",
    "    ])\n",
    "\n",
    "# 应用预处理器处理数据\n",
    "pipeline = Pipeline(steps=[('preprocessor', preprocessor)])\n",
    "processed_data = pipeline.fit_transform(df)\n",
    "\n",
    "# 分割特征和目标变量\n",
    "# 假设目标变量是 '平均交易价格'\n",
    "Y = processed_data[:, 2]  # 因为目标变量 '平均交易价格' 是倒数第二列\n",
    "X = processed_data[:, :2]\n",
    "\n",
    "# 拆分数据集\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=536)\n",
    "\n",
    "# 建立模型\n",
    "rf = RandomForestRegressor(n_estimators=321, max_depth=25)\n",
    "\n",
    "# 模型训练\n",
    "rf.fit(X_train, Y_train)\n",
    "\n",
    "# 使用模型进行预测\n",
    "Y_pred = rf.predict(X_test)\n",
    "\n",
    "# 打印预测结果\n",
    "print(\"Predicted values:\")\n",
    "print(Y_pred)\n",
    "\n",
    "# 比较预测结果与实际值\n",
    "print(\"\\nActual values:\")\n",
    "print(Y_test)\n",
    "\n",
    "# 计算评价指标\n",
    "mse = mean_squared_error(Y_test, Y_pred)\n",
    "r2 = r2_score(Y_test, Y_pred)\n",
    "\n",
    "print(f\"\\nMean Squared Error (MSE): {mse}\")\n",
    "print(f\"R-squared (R²): {r2}\")\n",
    "\n",
    "# 模型评估（可选）\n",
    "score = rf.score(X_test, Y_test)\n",
    "print(f\"\\nModel score (R² on test data): {score}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd544ab1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
